Comparative genomics between two version of cassava by using primary transcript information

1. Download and set parameter for BLAST

In [1]:
!mkdir -p input
# !gunzip input/Mesculenta_305_v6.1.cds_primaryTranscriptOnly.fa.gz
# !gunzip input/Mesculenta_671_v8.1.cds_primaryTranscriptOnly.fa.gz
In [2]:
num_core = 20
blast_type = "nucl"
input_file_old = "input/Mesculenta_305_v6.1.cds_primaryTranscriptOnly.fa"
input_file_new = "input/Mesculenta_671_v8.1.cds_primaryTranscriptOnly.fa"

2. First comparative using BLAST

2.1 Runing BLAST

In [3]:
# Create database index sequence
!makeblastdb -in $input_file_new -dbtype $blast_type -out input/new -title new

# Runing BLAST
!blastn -db input/new \
        -query $input_file_old \
        -evalue 1e-10 \
        -subject_besthit \
        -max_hsps 1 \
        -max_target_seqs 1 \
        -num_threads $num_core  \
        -outfmt "6 std qcovhsp" \
        -out out_blastn_db_new__query_old.txt

Building a new DB, current time: 05/09/2021 03:49:10
New DB name:   /home_sbi_cold/nattawet/work2021/genome_conversion/input/new
New DB title:  new
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 32805 sequences in 1.31391 seconds.


Warning: [blastn] Examining 5 or more matches is recommended
In [4]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.express as px

first_BLASTp = pd.read_table("out_blastn_db_new__query_old.txt", 
                             names=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qcovhsp"], 
                             header=None)
print("First BLAST result\nNumber of query sequences (new): %d\nNumber of subject sequences (old): %d" % (len(first_BLASTp.qseqid.unique()), len(first_BLASTp.sseqid.unique())))
First BLAST result
Number of query sequences (new): 29760
Number of subject sequences (old): 28219

2.2 Filtering of 1st BLAST

In [5]:
# sns.jointplot(data=first_BLASTp, x="qcovhsp", y="pident")
import plotly.express as px

fig = px.scatter(
    first_BLASTp,
    x="qcovhsp",
    y="pident",
    marginal_x="histogram",
    marginal_y="histogram",
    hover_data = {
        "qseqid": True,
        "sseqid": True,
    },
    width=500, 
    height=500)
fig.show()
In [6]:
first_BLASTp_filter_criteria = pd.DataFrame(columns = ["pident", "qcovhsp", "num_genes"])
for pident in range(0, 101, 5):
    for qcovhsp in range(0, 101, 5):
        first_BLASTp_filter_criteria = first_BLASTp_filter_criteria \
            .append({'pident': pident, 
                     'qcovhsp': qcovhsp,
                     'num_genes': first_BLASTp[(first_BLASTp.pident >= pident) & (first_BLASTp.qcovhsp >= qcovhsp)].shape[0]
                    }, ignore_index=True)

df = first_BLASTp_filter_criteria.pivot(index='pident', columns='qcovhsp', values='num_genes')
fig = px.imshow(df,
               labels=dict(x="> % Query coverage grather than", y="> % Identity grather than", color="Number of genes"),
               width=500, height=500)
fig.show()
In [7]:
first_BLASTp = first_BLASTp[(first_BLASTp.qcovhsp > 80) & (first_BLASTp.pident > 80)]
print("First BLAST result\nNumber of query sequences (new): %d\nNumber of subject sequences (old): %d" % (len(first_BLASTp.qseqid.unique()), len(first_BLASTp.sseqid.unique())))
First BLAST result
Number of query sequences (new): 27507
Number of subject sequences (old): 26660

3. Second comparative using BLAST

3.1 Runing BLAST

In [8]:
!makeblastdb -in $input_file_old -dbtype $blast_type -out input/old -title old

!blastn -db input/old \
        -query $input_file_new \
        -evalue 1e-10 \
        -subject_besthit \
        -max_hsps 1 \
        -max_target_seqs 1 \
        -num_threads 20  \
        -outfmt "6 std qcovhsp" \
        -out out_blastn_db_old__query_new.txt

Building a new DB, current time: 05/09/2021 03:50:01
New DB name:   /home_sbi_cold/nattawet/work2021/genome_conversion/input/old
New DB title:  old
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 33033 sequences in 1.46092 seconds.


Warning: [blastn] Examining 5 or more matches is recommended
In [9]:
second_BLASTp = pd.read_table("out_blastn_db_old__query_new.txt", 
                             names=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qcovhsp"], 
                             header=None)
print("First BLAST result\nNumber of query sequences (old): %d\nNumber of subject sequences (new): %d" % (len(second_BLASTp.qseqid.unique()), len(second_BLASTp.sseqid.unique())))
First BLAST result
Number of query sequences (old): 30288
Number of subject sequences (new): 27767

3.2 Filtering BLAST result

In [10]:
# sns.jointplot(data=second_BLASTp, x="qcovhsp", y="pident")
import plotly.express as px

fig = px.scatter(
    second_BLASTp,
    x="qcovhsp",
    y="pident",
    marginal_x="histogram",
    marginal_y="histogram",
    hover_data = {
        "qseqid": True,
        "sseqid": True,
    },
    width=500, 
    height=500)
fig.show()
In [11]:
second_BLASTp_filter_criteria = pd.DataFrame(columns = ["pident", "qcovhsp", "num_genes"])
for pident in range(0, 101, 5):
    for qcovhsp in range(0, 101, 5):
        second_BLASTp_filter_criteria = second_BLASTp_filter_criteria \
            .append({'pident': pident, 
                     'qcovhsp': qcovhsp,
                     'num_genes': second_BLASTp[(second_BLASTp.pident >= pident) & (second_BLASTp.qcovhsp >= qcovhsp)].shape[0]
                    }, ignore_index=True)

df = second_BLASTp_filter_criteria.pivot(index='pident', columns='qcovhsp', values='num_genes')
fig = px.imshow(df,
               labels=dict(x="% Query coverage grather than", y="> % Identity grather than", color="Number of genes"),
               width=500, 
               height=500)
fig.show()
In [12]:
second_BLASTp = second_BLASTp[(second_BLASTp.qcovhsp > 80) & (second_BLASTp.pident > 80)]
print("First BLAST result\nNumber of query sequences (old): %d\nNumber of subject sequences (new): %d" % (len(second_BLASTp.qseqid.unique()), len(second_BLASTp.sseqid.unique())))
First BLAST result
Number of query sequences (old): 27147
Number of subject sequences (new): 25601

4. Double best hit (DBH) filtering

In [13]:
DBH_blast_result = first_BLASTp \
    .filter(["qseqid", "sseqid"]) \
    .drop_duplicates() \
    .merge(second_BLASTp.filter(["qseqid", "sseqid"]).drop_duplicates(),
           how='left',
           left_on="qseqid",
           right_on="sseqid",
           suffixes=["_1st_BLASTp", "_2nd_BLASTp"]
          ) \
    .assign(DBH = lambda x: (x.sseqid_1st_BLASTp == x.qseqid_2nd_BLASTp) & (x.qseqid_1st_BLASTp == x.sseqid_2nd_BLASTp)) \
    .sort_values('DBH', ascending=False) \
    .filter(["qseqid_1st_BLASTp", "sseqid_1st_BLASTp", "DBH"]) \
    .rename(columns={"qseqid_1st_BLASTp": "old_transcript_id", 
                     "sseqid_1st_BLASTp": "new_transcript_id"}) \
    .assign(old_gene_id = lambda x: x.old_transcript_id.replace(".\d+$", "", regex=True),
            new_gene_id = lambda x: x.new_transcript_id.replace(".\d+$", "", regex=True)) \
    .assign(gene_match=lambda x: x.new_gene_id == x.old_gene_id)
DBH_blast_result
Out[13]:
old_transcript_id new_transcript_id DBH old_gene_id new_gene_id gene_match
14475 Manes.06G084500.1 Manes.06G084500.1 True Manes.06G084500 Manes.06G084500 True
18703 Manes.07G069900.1 Manes.07G069900.1 True Manes.07G069900 Manes.07G069900 True
18731 Manes.07G131600.1 Manes.07G131600.1 True Manes.07G131600 Manes.07G131600 True
18730 Manes.07G072800.1 Manes.07G072800.2 True Manes.07G072800 Manes.07G072800 True
18728 Manes.07G034000.1 Manes.07G034000.1 True Manes.07G034000 Manes.07G034000 True
... ... ... ... ... ... ...
4140 Manes.02G048600.1 Manes.02G048600.1 False Manes.02G048600 Manes.02G048600 True
17046 Manes.15G192800.1 Manes.15G192766.1 False Manes.15G192800 Manes.15G192766 False
10390 Manes.16G000500.1 Manes.16G000500.4 False Manes.16G000500 Manes.16G000500 True
10386 Manes.16G016200.1 Manes.16G016200.1 False Manes.16G016200 Manes.16G016200 True
0 Manes.S111100.1 Manes.02G220660.1 False Manes.S111100 Manes.02G220660 False

28950 rows × 6 columns

In [14]:
DBH_blast_result \
    .groupby('DBH') \
    .size()
Out[14]:
DBH
False     3916
True     25034
dtype: int64
In [15]:
DBH_blast_result.groupby(["DBH", "gene_match"]).size()
Out[15]:
DBH    gene_match
False  False          2179
       True           1737
True   False          1548
       True          23486
dtype: int64
In [16]:
!wget -q https://raw.githubusercontent.com/evolu-tion/GenomeManagement/master/seq_manage.py
In [17]:
from seq_manage import Gff_manager

def getOrderGeneFromChr(fileName):
    gff = Gff_manager(fileName)
    df = pd.DataFrame(gff.getTableSpecificType("gene"), 
                      columns=["chr", "source", "feature", "start", "end", "score", "strand", "frame", "attr"])
    df[["ID", "GeneID", "ancestorIdentifier"]] = pd.DataFrame(df["attr"].to_list())
    df = df.replace('^[A-Za-z]+=','',regex=True) \
        .drop(columns=["attr"]) \
        .sort_values(by = ["chr", "start", "GeneID"], ignore_index = True) \
        .rename_axis("Order").reset_index()
    return df[["GeneID", "Order"]].set_index("GeneID")

tbl_gene_new = getOrderGeneFromChr("input/Mesculenta_671_v8.1.gene.gff3.gz")
tbl_gene_old = getOrderGeneFromChr("input/Mesculenta_305_v6.1.gene.gff3.gz")
In [18]:
DBH_blast_result = DBH_blast_result \
    .merge(tbl_gene_new, 
         how = 'left', 
         left_on = "new_gene_id",
         right_index = True) \
    .merge(tbl_gene_old, 
             how = 'left', 
             left_on = "old_gene_id",
             right_index = True,
             suffixes = ("_new", "_old")) \
    .assign(new_chr = DBH_blast_result.new_gene_id.replace('.\d+$', '', regex=True).replace('Manes.', 'Chr', regex=True))
In [19]:
import plotly.express as px

fig = px.scatter(
    DBH_blast_result.sort_values("new_transcript_id"),
    x="Order_new",
    y="Order_old",
    color = "new_chr",
    hover_data = {
        "new_transcript_id" : True,
        "old_transcript_id" : True,
        "Order_new": False,
        "Order_old": False,
        "new_chr": False,
        "DBH": True
    },
    width=600,
    height=500,
    labels=dict(
        Order_new="Gene ID (New version)",
        Order_old="Gene ID (Old version)",
        new_chr="Chromosome of new version" 
    )
)
fig.show()
In [20]:
DBH_blast_result.to_csv("DBH_BLASTn_old_new.txt", sep='\t', index=False)